Importing pandas and plotly express
import pandas as pd
import plotly.express as px
df = pd.read_csv('Airbnb_Open_Data.csv')
C:\Users\DELL\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3146: DtypeWarning: Columns (25) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
df.columns
Index(['id', 'NAME', 'host id', 'host_identity_verified', 'host name',
'neighbourhood group', 'neighbourhood', 'lat', 'long', 'country',
'country code', 'instant_bookable', 'cancellation_policy', 'room type',
'Construction year', 'price', 'service fee', 'minimum nights',
'number of reviews', 'last review', 'reviews per month',
'review rate number', 'calculated host listings count',
'availability 365', 'house_rules', 'license'],
dtype='object')
Dropping columns not needed/ not significant
df.drop(columns={'host id','host name','lat','long', 'country code','house_rules','license'}, inplace = True)
df.head(5)
| id | NAME | host_identity_verified | neighbourhood group | neighbourhood | country | instant_bookable | cancellation_policy | room type | Construction year | price | service fee | minimum nights | number of reviews | last review | reviews per month | review rate number | calculated host listings count | availability 365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001254 | Clean & quiet apt home by the park | unconfirmed | Brooklyn | Kensington | United States | False | strict | Private room | 2020.0 | $966 | $193 | 10.0 | 9.0 | 10/19/2021 | 0.21 | 4.0 | 6.0 | 286.0 |
| 1 | 1002102 | Skylit Midtown Castle | verified | Manhattan | Midtown | United States | False | moderate | Entire home/apt | 2007.0 | $142 | $28 | 30.0 | 45.0 | 5/21/2022 | 0.38 | 4.0 | 2.0 | 228.0 |
| 2 | 1002403 | THE VILLAGE OF HARLEM....NEW YORK ! | NaN | Manhattan | Harlem | United States | True | flexible | Private room | 2005.0 | $620 | $124 | 3.0 | 0.0 | NaN | NaN | 5.0 | 1.0 | 352.0 |
| 3 | 1002755 | NaN | unconfirmed | Brooklyn | Clinton Hill | United States | True | moderate | Entire home/apt | 2005.0 | $368 | $74 | 30.0 | 270.0 | 7/5/2019 | 4.64 | 4.0 | 1.0 | 322.0 |
| 4 | 1003689 | Entire Apt: Spacious Studio/Loft by central park | verified | Manhattan | East Harlem | United States | False | moderate | Entire home/apt | 2009.0 | $204 | $41 | 10.0 | 9.0 | 11/19/2018 | 0.10 | 3.0 | 1.0 | 289.0 |
Finding null values in each column
for i in df.columns:
print(df[i].isnull().mean())
0.0 0.002436670922718545 0.002816791586662638 0.00028265382703535123 0.00015594693905398687 0.005185235723545064 0.001023401787541789 0.0007407479605064377 0.0 0.0020857903098470743 0.0024074308716459224 0.002660844647608651 0.00398639362956754 0.001783643115429975 0.15490404389906334 0.1547675903273911 0.0031774188832249827 0.0031091920973888632 0.004366514293511633
Filling null values with 'unknown' for string based columns
df.iloc[:,1:8] = df.iloc[:,1:8].fillna('Unknown')
Removing all rows where 'last review' is null
df = df[df['last review'].notna()]
df.head(3)
| id | NAME | host_identity_verified | neighbourhood group | neighbourhood | country | instant_bookable | cancellation_policy | room type | Construction year | price | service fee | minimum nights | number of reviews | last review | reviews per month | review rate number | calculated host listings count | availability 365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001254 | Clean & quiet apt home by the park | unconfirmed | Brooklyn | Kensington | United States | False | strict | Private room | 2020.0 | $966 | $193 | 10.0 | 9.0 | 10/19/2021 | 0.21 | 4.0 | 6.0 | 286.0 |
| 1 | 1002102 | Skylit Midtown Castle | verified | Manhattan | Midtown | United States | False | moderate | Entire home/apt | 2007.0 | $142 | $28 | 30.0 | 45.0 | 5/21/2022 | 0.38 | 4.0 | 2.0 | 228.0 |
| 3 | 1002755 | Unknown | unconfirmed | Brooklyn | Clinton Hill | United States | True | moderate | Entire home/apt | 2005.0 | $368 | $74 | 30.0 | 270.0 | 7/5/2019 | 4.64 | 4.0 | 1.0 | 322.0 |
For the numeric columns, each null value in each column is filled with the mean of the column. For the column of 'price' and'service fee', first the '$',',' charachters are removed after which they are convered into float data types so to be used in the mean calculation
df.iloc[:,9:14]
| Construction year | price | service fee | minimum nights | number of reviews | |
|---|---|---|---|---|---|
| 0 | 2020.0 | $966 | $193 | 10.0 | 9.0 |
| 1 | 2007.0 | $142 | $28 | 30.0 | 45.0 |
| 3 | 2005.0 | $368 | $74 | 30.0 | 270.0 |
| 4 | 2009.0 | $204 | $41 | 10.0 | 9.0 |
| 5 | 2013.0 | $577 | $115 | 3.0 | 74.0 |
| ... | ... | ... | ... | ... | ... |
| 102588 | 2016.0 | $618 | $124 | 1.0 | 177.0 |
| 102591 | 2005.0 | $1,139 | $228 | 5.0 | 17.0 |
| 102593 | 2017.0 | $1,099 | $220 | 1.0 | 8.0 |
| 102595 | 2016.0 | $837 | $167 | 1.0 | 1.0 |
| 102597 | 2015.0 | $546 | $109 | 2.0 | 5.0 |
86706 rows × 5 columns
df['price'] = df['price'].str.replace('$', '')
df['price'] = df['price'].str.replace(',', '')
df['service fee'] = df['service fee'].str.replace('$', '')
df['service fee'] = df['service fee'].str.replace(',', '')
df['price'] = df['price'].astype('float')
df['service fee'] = df['service fee'].astype('float')
df.iloc[:,9:14] = df.iloc[:,9:14].fillna(df.iloc[:,9:14].mean())
For the reviews and avaliability columns, similar method is used to fill the null values with the mean of the column.
df.iloc[:,15:] = df.iloc[:,15:].fillna(df.iloc[:,15:].mean())
Each column is again checked for null values
for i in df.columns:
print(df[i].isnull().mean())
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
The following are some visulisations based on the data
nb = df["neighbourhood group"].value_counts()
fig = px.bar(y = nb.values,
x = nb.index,
color = nb.index,
color_discrete_sequence=px.colors.sequential.PuBuGn,
text = nb.values,
title = "Top Neighbourhood group",
template = 'ggplot2')
fig.update_layout(
xaxis_title = "Neighbourhood group",
yaxis_title = "count")
fig.show()
host_ident = df["host_identity_verified"].value_counts()
fig = px.pie(
values = host_ident.values,
names = host_ident.index,
color_discrete_sequence=px.colors.sequential.PuBu,
title= 'Host identity distribution',template = 'ggplot2'
)
fig.show()
fig = px.box(
x=df["host_identity_verified"],
y=df['price'],
template= 'ggplot2',
title = 'Host identity confirmation comparing to the price')
fig.update_layout(
xaxis_title = "Host identity",
yaxis_title = "price",)
fig.show()
rm = df["room type"].value_counts()
fig = px.bar(y = rm.values,
x = rm.index,
color = rm.index,
color_discrete_sequence=px.colors.sequential.PuBuGn,
text = rm.values,
title = "Room type",
template = "ggplot2"
)
fig.update_layout(
xaxis_title = "Room type",
yaxis_title = "count")
fig.show()
fig = px.box(
x=df["room type"],
y=df['price'],
template= 'ggplot2',
title = 'Room type compared to the price')
fig.update_layout(
xaxis_title = "Room type",
yaxis_title = "price")
fig.show()
fig = px.box(
x=df["cancellation_policy"],
y=df['price'],
template= 'ggplot2',
title = 'cancellation policy compared to the price')
fig.update_layout(
xaxis_title = "concellation policy",
yaxis_title = "price")
fig.show()